1   package org.apache.lucene.index;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.util.Arrays;
22  import java.util.Random;
23  
24  import org.apache.lucene.analysis.*;
25  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26  import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
27  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
28  import org.apache.lucene.codecs.Codec;
29  import org.apache.lucene.codecs.TermVectorsReader;
30  import org.apache.lucene.document.Document;
31  import org.apache.lucene.document.Field;
32  import org.apache.lucene.document.FieldType;
33  import org.apache.lucene.document.StoredField;
34  import org.apache.lucene.document.TextField;
35  import org.apache.lucene.search.DocIdSetIterator;
36  import org.apache.lucene.store.Directory;
37  import org.apache.lucene.util.BytesRef;
38  import org.apache.lucene.util.LuceneTestCase;
39  import org.apache.lucene.util.TestUtil;
40  
/**
 * Tests the codec-level {@link TermVectorsReader}: setUp builds a one-segment
 * index of 5 identical documents, each with 4 fields ("f1".."f4") that store
 * term vectors with varying position/offset options, and each field containing
 * the same 4 terms, each occurring {@code TERM_FREQ} times. The tests then open
 * the segment's vectors reader directly and verify terms, docs, positions and
 * offsets round-trip correctly.
 */
public class TestTermVectorsReader extends LuceneTestCase {
  //Must be lexicographically sorted, will do in setup, versus trying to maintain here
  private String[] testFields = {"f1", "f2", "f3", "f4"};
  // Parallel to testFields: whether each field stores term-vector positions / offsets.
  private boolean[] testFieldsStorePos = {true, false, true, false};
  private boolean[] testFieldsStoreOff = {true, false, false, true};
  private String[] testTerms = {"this", "is", "a", "test"};
  // positions[i][j] = absolute position of the j-th occurrence of testTerms[i]; filled in setUp.
  private int[][] positions = new int[testTerms.length][];
  private Directory dir;
  // The single segment produced by setUp; tests open its term vectors directly.
  private SegmentCommitInfo seg;
  private FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]);
  // Number of occurrences of every term in every field of every document.
  private static int TERM_FREQ = 3;

  // A single pre-computed token occurrence; sorted by position so MyTokenizer
  // can emit tokens in order with non-negative position increments.
  private class TestToken implements Comparable<TestToken> {
    String text;
    int pos;
    int startOffset;
    int endOffset;
    @Override
    public int compareTo(TestToken other) {
      // pos values are small (< ~40), so subtraction cannot overflow here.
      return pos - other.pos;
    }
  }

  TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];

  @Override
  public void setUp() throws Exception {
    super.setUp();
    /*
    for (int i = 0; i < testFields.length; i++) {
      fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
    }
    */

    // TermsEnum iterates in sorted term order, so the expected-terms array must be sorted too.
    Arrays.sort(testTerms);
    int tokenUpto = 0;
    Random rnd = random();
    for (int i = 0; i < testTerms.length; i++) {
      positions[i] = new int[TERM_FREQ];
      // the j-th occurrence gets a position in [j*10, j*10 + 10), so each
      // term's positions are strictly increasing across occurrences
      for (int j = 0; j < TERM_FREQ; j++) {
        // positions are always sorted in increasing order
        positions[i][j] = (int) (j * 10 + rnd.nextDouble() * 10);
        TestToken token = tokens[tokenUpto++] = new TestToken();
        token.text = testTerms[i];
        token.pos = positions[i][j];
        // Offsets are deterministic (j*10 .. j*10+len); testPositionReader and
        // testOffsetReader assert exactly these values back.
        token.startOffset = j * 10;
        token.endOffset = j * 10 + testTerms[i].length();
      }
    }
    // Emit tokens in position order (distinct terms may share a position within
    // the same [j*10, j*10+10) bucket, giving a 0 increment, which is legal).
    Arrays.sort(tokens);

    dir = newDirectory();
    // No compound file and a non-merging buffer config so the segment's term
    // vector files are directly readable by the codec's vectors reader.
    IndexWriter writer = new IndexWriter(
        dir,
        newIndexWriterConfig(new MyAnalyzer()).
            setMaxBufferedDocs(-1).
            setMergePolicy(newLogMergePolicy(false, 10))
            .setUseCompoundFile(false)
    );

    Document doc = new Document();
    for(int i=0;i<testFields.length;i++) {
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      if (testFieldsStorePos[i] && testFieldsStoreOff[i]) {
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorPositions(true);
        customType.setStoreTermVectorOffsets(true);
      }
      else if (testFieldsStorePos[i] && !testFieldsStoreOff[i]) {
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorPositions(true);
      }
      else if (!testFieldsStorePos[i] && testFieldsStoreOff[i]) {
        // NOTE(review): this branch enables positions as well, even though
        // testFieldsStorePos[i] is false — presumably because offsets are
        // stored alongside positions; confirm against the vectors format.
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorPositions(true);
        customType.setStoreTermVectorOffsets(true);
      }
      else {
        customType.setStoreTermVectors(true);
      }
      // Field text is empty: MyTokenizer ignores its input and replays `tokens`.
      doc.add(new Field(testFields[i], "", customType));
    }

    //Create 5 documents for testing, they all have the same
    //terms
    for(int j=0;j<5;j++) {
      writer.addDocument(doc);
    }
    writer.commit();
    seg = writer.newestSegment();
    writer.close();

    fieldInfos = IndexWriter.readFieldInfos(seg);
  }
  
  @Override
  public void tearDown() throws Exception {
    dir.close();
    super.tearDown();
  }

  // Replays the pre-built `tokens` array, ignoring the reader input entirely.
  private class MyTokenizer extends Tokenizer {
    private int tokenUpto;
    
    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;
    private final OffsetAttribute offsetAtt;
    
    public MyTokenizer() {
      super();
      termAtt = addAttribute(CharTermAttribute.class);
      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      offsetAtt = addAttribute(OffsetAttribute.class);
    }
    
    @Override
    public boolean incrementToken() {
      if (tokenUpto >= tokens.length) {
        return false;
      } else {
        final TestToken testToken = tokens[tokenUpto++];
        clearAttributes();
        termAtt.append(testToken.text);
        offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
        if (tokenUpto > 1) {
          // Delta-encode: increment is the gap from the previous token's position.
          posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
        } else {
          // The position counter starts at -1, so an increment of pos+1 places
          // the first token at absolute position `pos`.
          posIncrAtt.setPositionIncrement(testToken.pos+1);
        }
        return true;
      }
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      this.tokenUpto = 0; // rewind so the stream can be consumed once per field
    }
  }

  private class MyAnalyzer extends Analyzer {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new MyTokenizer());
    }
  }

  /** Sanity check: every segment written in setUp reports stored term vectors. */
  public void test() throws IOException {
    //Check to see the files were created properly in setup
    DirectoryReader reader = DirectoryReader.open(dir);
    for (LeafReaderContext ctx : reader.leaves()) {
      SegmentReader sr = (SegmentReader) ctx.reader();
      assertTrue(sr.getFieldInfos().hasVectors());
    }
    reader.close();
  }

  /** Verifies all terms come back, in sorted order, for every document's vectors. */
  public void testReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
      Terms vector = reader.get(j).terms(testFields[0]);
      assertNotNull(vector);
      assertEquals(testTerms.length, vector.size());
      TermsEnum termsEnum = vector.iterator();
      for (int i = 0; i < testTerms.length; i++) {
        final BytesRef text = termsEnum.next();
        assertNotNull(text);
        String term = text.utf8ToString();
        //System.out.println("Term: " + term);
        assertEquals(testTerms[i], term);
      }
      // The enum must be exhausted after exactly testTerms.length terms.
      assertNull(termsEnum.next());
    }
    reader.close();
  }
  
  /** Verifies the docs enum of a term vector yields exactly one (virtual) doc. */
  public void testDocsEnum() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
      Terms vector = reader.get(j).terms(testFields[0]);
      assertNotNull(vector);
      assertEquals(testTerms.length, vector.size());
      TermsEnum termsEnum = vector.iterator();
      PostingsEnum postingsEnum = null;
      for (int i = 0; i < testTerms.length; i++) {
        final BytesRef text = termsEnum.next();
        assertNotNull(text);
        String term = text.utf8ToString();
        //System.out.println("Term: " + term);
        assertEquals(testTerms[i], term);
        
        postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
        assertNotNull(postingsEnum);
        // Unpositioned enum reports -1, then exactly one doc, then exhaustion.
        int doc = postingsEnum.docID();
        assertEquals(-1, doc);
        assertTrue(postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
      }
      assertNull(termsEnum.next());
    }
    reader.close();
  }

  /**
   * Verifies positions (and, on a second pass, offsets) round-trip for field f1,
   * and that a field stored without positions/offsets (f2) still exposes postings.
   */
  public void testPositionReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    BytesRef[] terms;
    Terms vector = reader.get(0).terms(testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    TermsEnum termsEnum = vector.iterator();
    PostingsEnum dpEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      //System.out.println("Term: " + term);
      assertEquals(testTerms[i], term);

      // First pass: positions only.
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      int doc = dpEnum.docID();
      assertEquals(-1, doc);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(dpEnum.freq(), positions[i].length);
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

      // Second pass: positions plus the deterministic offsets set up in setUp.
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      doc = dpEnum.docID();
      assertEquals(-1, doc);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertNotNull(dpEnum);
      assertEquals(dpEnum.freq(), positions[i].length);
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
        assertEquals(j*10, dpEnum.startOffset());
        assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    }

    Terms freqVector = reader.get(0).terms(testFields[1]); //no pos, no offset
    assertNotNull(freqVector);
    assertEquals(testTerms.length, freqVector.size());
    termsEnum = freqVector.iterator();
    assertNotNull(termsEnum);
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      //System.out.println("Term: " + term);
      assertEquals(testTerms[i], term);
      // Even without stored positions/offsets, a postings enum must be available.
      assertNotNull(termsEnum.postings(null));
      assertNotNull(termsEnum.postings(null, PostingsEnum.ALL));
    }
    reader.close();
  }

  /** Same as testPositionReader's f1 checks, focused on offset round-tripping. */
  public void testOffsetReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    Terms vector = reader.get(0).terms(testFields[0]);
    assertNotNull(vector);
    TermsEnum termsEnum = vector.iterator();
    assertNotNull(termsEnum);
    assertEquals(testTerms.length, vector.size());
    PostingsEnum dpEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      assertEquals(testTerms[i], term);

      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(dpEnum.freq(), positions[i].length);
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

      // Re-acquire the enum and verify offsets match the j*10 layout from setUp.
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertNotNull(dpEnum);
      assertEquals(dpEnum.freq(), positions[i].length);
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
        assertEquals(j*10, dpEnum.startOffset());
        assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    }
    reader.close();
  }

  /**
   * Every illegal term-vector FieldType combination must be rejected by
   * IndexWriter with an IllegalArgumentException whose message is pinned here.
   * MockAnalyzer checks are disabled so the writer (not the analyzer) is what
   * rejects the document.
   */
  public void testIllegalIndexableField() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    // Payloads require positions.
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPayloads(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector payloads without term vector positions (field=\"field\")", iae.getMessage());
    }

    // Offsets require term vectors to be enabled.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorOffsets(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector offsets when term vectors are not indexed (field=\"field\")", iae.getMessage());
    }

    // Positions require term vectors to be enabled.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorPositions(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector positions when term vectors are not indexed (field=\"field\")", iae.getMessage());
    }

    // Payloads require term vectors to be enabled.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorPayloads(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector payloads when term vectors are not indexed (field=\"field\")", iae.getMessage());
    }

    // Payloads with vectors but without positions are still illegal.
    ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPayloads(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot index term vector payloads without term vector positions (field=\"field\")", iae.getMessage());
    }

    // A stored-only (non-indexed) field cannot carry term vectors...
    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectors(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vectors for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    // ...nor term vector positions...
    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorPositions(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vector positions for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    // ...nor term vector offsets...
    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorOffsets(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vector offsets for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    // ...nor term vector payloads.
    ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorPayloads(true);
    doc = new Document();
    doc.add(new Field("field", "value", ft));
    try {
      w.addDocument(doc);
      fail("did not hit exception");
    } catch (IllegalArgumentException iae) {
      // Expected
      assertEquals("cannot store term vector payloads for a field that is not indexed (field=\"field\")", iae.getMessage());
    }

    w.close();
    
    dir.close();
  }
}